\documentclass[10pt,a4paper]{article}
\usepackage{graphicx}
\usepackage[english]{babel}
\usepackage{fullpage}
\usepackage{latexsym}
\usepackage{amssymb}
\usepackage[T1]{fontenc}
\usepackage[sc]{mathpazo}
\linespread{1.05}
\usepackage{subfig}


\title{Applied Statistics\\ Report on the problems}
\author{Mieke Hiltermann and Lotte van den Berg \\ Utrecht University}

\newcommand{\mco}{\mathcal{O}}
\newcommand{\mcg}{\mathcal{G}}
\newcommand{\mcv}{\mathcal{V}}
\newcommand{\mce}{\mathcal{E}}
\newcommand{\mcn}{\mathcal{N}}
\newcommand{\real}{\mathbb{R}}

\begin{document}

% \begin{titlepage}
\maketitle 

% \tableofcontents

\section*{Problem 1}
\textit{A pilot study of a new process for making a jet aircraft engine yields observations of the vane diameter (measured in inches), which are recorded in the data file vanes.txt. The data consists of 20 rational subgroups, each of size 5. The customer requires that the diameters must be between 0.5005 and 0.5055 inches. The ideal value of the diameter is 0.5030 inches.}



% --------------------------------------------------- opgave 1C -----------------------------------------------------------------------
\subsection*{(c)}
\textit{Compute a $\beta$-expectation tolerance interval that contains $99.73 \%$ of the underlying distribution of the data. Compare these intervals with the results of
(b).}\\[.5cm]

\noindent We want a tolerance interval of the underlying distribution of the data. In (a) we showed that the data has a normal distribution, so we base the tolerance interval on the normal density function $f(x) = \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{1}{2\sigma^2} (x-\mu)^2}$. We then need estimators for $\mu$ and $\sigma$. As unbiased estimators we use:
$$ \widehat{\mu} = \frac{1}{n} \sum_{i=1}^{n} X_i = \overline{X}$$
$$ \widehat{\sigma}^2 = \frac{1}{n-1} \sum_{i=1}^{n} (X_i-\overline{X})^2 $$
Plugging these estimates into the central $99.73\%$ probability interval of the normal distribution, $\big(\widehat{\mu} + \widehat{\sigma}\,\Phi^{-1}(\tfrac{1-\beta}{2}),\ \widehat{\mu} + \widehat{\sigma}\,\Phi^{-1}(1-\tfrac{1-\beta}{2})\big)$ with $\beta=0.9973$, gives the tolerance interval.

\begin{verbatim}
# Compute unbiased estimators for \sigma^2 and \mu;
> mu <- mean(vanes$V1)
> mu
[1] 0.503332
> sigma2 <- 100/99*var(vanes$V1)
> sigma <- sqrt(sigma2)
> sigma
[1] 0.0003316086

# smallest beta-expectation tolerance interval for standard normal:
> qnorm((1-0.9973)/2)
[1] -2.999977
> qnorm(1-(1-0.9973)/2)
[1] 2.999977

# smallest beta-expectation tolerance interval for estimated normal distribution:
> mu + qnorm((1-0.9973)/2)*sigma
[1] 0.5023372
> mu + qnorm(1-(1-0.9973)/2)*sigma
[1] 0.5043268
\end{verbatim}

\noindent Thus the $\beta$-expectation tolerance interval with $\beta = 0.9973$ is $(0.5023;\,0.5043)$.\\
\\
\noindent We want to compare this interval with the interval we found in (b), but there the interval is based on the group data. The group means have a smaller standard deviation than the individual observations, so it is reasonable that that interval, $(0.5030;\,0.5037)$, is narrower than the interval found here, $(0.5023;\,0.5043)$.
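As a quick check, the probability content of this interval under the fitted normal distribution equals $\beta$ by construction. A minimal sketch in R, reusing \texttt{mu} and \texttt{sigma} from above (the names \texttt{LTL} and \texttt{UTL} are ours):
\begin{verbatim}
# sketch: probability content of the tolerance interval under the fitted normal
> LTL <- mu + qnorm((1-0.9973)/2)*sigma
> UTL <- mu + qnorm(1-(1-0.9973)/2)*sigma
> pnorm(UTL, mean=mu, sd=sigma) - pnorm(LTL, mean=mu, sd=sigma)
[1] 0.9973
\end{verbatim}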

% --------------------------------------------------------------------------------------------------------------------------
\subsection*{(d)}
\textit{It is required by customers that the production process has a $C_{pk}$ of $2.0$. Discuss whether the pilot study indicates that the process is sufficiently capable. If
the current process is not sufficiently capable, discuss whether the pilot study indicates that with minor changes the process will be able to meet the capability
requirements.}\\[.5cm]
In our process $C_{pk}=2.774$; therefore the pilot study does not satisfy the requirement of a $C_{pk}$ value of $2.00$, and we want to bring this capability index to $2.0$. For a non-centered process with a normal distribution, for which it holds that $\frac{1}{2}(USL+LSL)\leq \mu\leq USL$, we have 
\[
C_{pk}=\frac{USL-\mu}{3\sigma}.
\]
In order to change this quantity while keeping the USL the same (because the customer still requires the same specification limits for the diameters), either the mean of the process or the variation of the process has to change. In this case it is not desirable to change the mean of the process, because the center of the process is already above the ideal value. Therefore the variance would have to change. Adjusting the variance is usually very hard and typically involves a major change of the production process. It is therefore probably not possible to meet the capability requirements with minor changes to the process.\\
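As a quick check, the capability indices can be recomputed in R from the estimates $\widehat\mu=0.503332$ and $\widehat\sigma=0.0002605242$ that are also used in (e) below (a minimal sketch):
\begin{verbatim}
# sketch: capability indices from the Phase I estimates (values as in (e))
mean   <- 0.503332
StdDev <- 0.0002605242
LSL    <- 0.5005
USL    <- 0.5055
Cp  <- (USL-LSL)/(6*StdDev)                                 # approx. 3.199
Cpk <- min((USL-mean)/(3*StdDev),(mean-LSL)/(3*StdDev))     # approx. 2.774
\end{verbatim}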

\subsection*{(e)}
\textit{An engineer claims that it is not unlikely that during mass production the mean diameter of the vanes shifts by an amount of $0.0005$ inches, while the variance keeps its current value. Calculate the impact on the capability indices of such
a change.}\\[.5cm]
Define $\delta:=0.0005$. Suppose the mean has shifted upwards by $\delta$ and define $\mu:=\overline{X}+\delta$.
Then:
\begin{eqnarray*}
C_p		&=&\frac{USL-LSL}{6\sigma}\\
 			&=&3.198679\\
C_{pk}&=&\min\big(\frac{USL-\mu}{3\sigma}, \frac{\mu-LSL}{3\sigma}\big)\\
			&=&	2.134159.\\
\end{eqnarray*}
Before the shift the capability indices were $3.199$ and $2.774$ respectively. The $C_p$ value stays exactly the same when only the mean shifts; the small difference in the digits is due to rounding. The $C_{pk}$ changes by a factor $\frac{2.134159}{2.774}=0.769$, i.e.\ it drops to $77\%$ of its original value; the upper specification limit is still the binding one in the minimum.\\
If we suppose the mean has shifted downwards by $\delta$ and define $\mu:=\overline{X}-\delta$, we get:
\begin{eqnarray*}
C_p		&=&\frac{USL-LSL}{6\sigma}\\
 			&=&3.198679\\
C_{pk}&=&\min\big(\frac{USL-\mu}{3\sigma}, \frac{\mu-LSL}{3\sigma}\big)\\
			&=&	2.983728.\\
\end{eqnarray*}
This time $C_{pk}$ changes by a factor $\frac{2.983728}{2.774}=1.076$, i.e.\ it rises to $108\%$ of its original value; now the lower specification limit is the binding one in the minimum.\\
\begin{verbatim}
#defining the variables
mean 		<-	0.503332 
StdDev	<-	0.0002605242
delta		<-	0.0005
LSL		<-	0.5005  
USL		<-	0.5055 
Cporg		<-	3.199 
Cpkorg	<-	2.774 

#suppose a positive shift
mupos		<-	mean+delta
Cppos 	<-	(USL-LSL)/(6*StdDev)
Cpkpos	<-	min((USL-mupos)/(3*StdDev),(mupos-LSL)/(3*StdDev))

#suppose a negative shift
muneg		<-	mean-delta
Cpneg 	<-	(USL-LSL)/(6*StdDev)
Cpkneg	<-	min((USL-muneg)/(3*StdDev),(muneg-LSL)/(3*StdDev))

#change
Cpkpos/Cpkorg
Cpkneg/Cpkorg
\end{verbatim}

\subsection*{(f)}
\textit{Set up two-sided ``improved data driven control charts'' (with in case of the minimum chart $m = 3$), see Albers, W. and Kallenberg, W.C.M. (2006). Improved
data driven control charts. TW-Report No. 1791, using the 100 observations as
individual observations (ignore the group-structure) for the following situations:\\
\begin{enumerate}
	\item use corrections to reduce the bias \\
	\item use corrections for the exceedance probability approach with $\varepsilon = 0.1$ and
				$ \alpha = 0.2$. \\
\end{enumerate}
Take in any case $p = 0.002$. Clearly indicate how you arrived at the control
limits.}\\[.5cm]

\noindent
The observations given in \textit{`vanes.txt'} are treated as individual Phase I observations $X_1,\ldots,X_n$ with $n=100$; these observations are used to estimate the parameters. First we need to check whether the data is normally distributed. To this end define $X_{(1)}:=\min(X_1,\ldots,X_n)$ and $X_{(n)}:=\max(X_1,\ldots,X_n)$. The mean and standard deviation are estimated from the Phase I observations as
\begin{eqnarray*}
\overline X&=&\frac{1}{n}\sum_{i=1}^nX_i\\
S^2&=&\frac{1}{n-1}\sum_{i=1}^{n}(X_i-\overline X)^2.\\
\end{eqnarray*}
We can say the data is normally distributed when $d_1\leq\frac{X_{(n)}-\overline X}{S}\leq d_2$ and $d_1\leq\frac{\overline X- X_{(1)}}{S}\leq d_2$, with $\overline X$ the center of the data and $d_1$ and $d_2$ defined as follows:
\begin{eqnarray*}
d_1	&=& u_{\frac{-0.7+0.5\log n}{n}}\\
		&=&	\Phi^{-1}(1-\frac{-0.7+0.5\log n}{n})\\
d_2	&=&	u_{\frac{5}{n\sqrt n}}\\
		&=&	\Phi^{-1}(1-\frac{5}{n\sqrt n}).\\
\end{eqnarray*}
Later on we can use the improved estimated upper control limit of a normal distribution when $d_1\leq\frac{X_{(n)}-\overline X}{S}\leq d_2$ and we can use the improved estimated lower control limit of a normal distribution when $d_1\leq\frac{\overline X- X_{(1)}}{S}\leq d_2$. In order to check this we inserted the following lines in R.\\
\begin{verbatim}
> vanes <- read.table("vanes.txt")
> n<-100
> 
> x1 <-min(vanes$V1)
> x100 <- max(vanes$V1)
> 
> samplenumber<- rep(1:n, each=1)
> vanesdata <- qcc.groups(vanes$V1, samplenumber)
> stats.xbar(vanesdata)
$statistics
[1] 0.503332

$center
[1] 0.503332

> xbar=0.503332
> S <-sd.xbar(vanesdata)
> 
> d1 <- qnorm((1-(-0.7+0.5*log(n))/n))
> d2<-qnorm(1-(5/(n*n^(1/2))))
> 
> LL<-(xbar-x1)/S
> UL<-(x100-xbar)/S
> 
> d1<=LL && LL<=d2
[1] TRUE
> d1<=UL && UL<=d2
[1] FALSE
> UL<=d2
[1] FALSE
\end{verbatim}
It turns out that UL is not smaller than or equal to $d_2$: closer inspection shows that $d_2=2.575829$ and $UL=2.926410$. But we should not forget that in (a) we saw that this data set was not in control, because it contained two outliers. Exactly these outliers are equal to $X_{(1)}$ and $X_{(n)}$, which distorts this check. Therefore we assume that the data is normally distributed, just as we did in (a).\\
\\
We want to have a False Alarm Rate $p=0.002$. The Average Run Length and the FAR depend on the Phase I observations and have therefore become random variables; we write $FAR=P_n$ and $ARL=\frac{1}{P_n}$, and these should be close to $p$ and $\frac{1}{p}$ respectively.\\
This can be achieved in two ways: by reducing the bias $E(P_n)-p$, or by controlling the exceedance probability $P(P_n>p(1+\varepsilon))\leq\alpha$, with $\varepsilon=0.1$ and $\alpha=0.2$.\\
\\
To get a small bias or exceedance probability, huge samples are needed when estimators of the parameters are simply plugged into the control limits. Corrections of the control limits are available that reduce the bias and the exceedance probability sufficiently well for samples of common size. \\
\\
Call the distribution function of the observations $F$, and define $\overline F:=1-F$. For the normal chart we take $UCL=\overline F^{\,-1}(\frac{p}{2})$ and $LCL=F^{-1}(\frac{p}{2})$, so that $FAR=p$. Plugging in the estimates gives the uncorrected control limits $\overline X \pm u_{\frac{p}{2}}S$, with $u_{\frac{p}{2}}=\overline{\Phi}^{\,-1}(\frac{p}{2})$.\\
\\
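\noindent For comparison, these uncorrected limits can be computed directly from \texttt{xbar} and \texttt{S} as obtained above (a minimal sketch; the names \texttt{LCL0} and \texttt{UCL0} are ours):
\begin{verbatim}
# sketch: uncorrected (plug-in) control limits
> p <- 0.002
> LCL0 <- xbar - qnorm(1-p/2)*S
> UCL0 <- xbar + qnorm(1-p/2)*S
\end{verbatim}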
If our aim is to get $E(P_n)=p$ (so to reduce the bias) then we should take as $LCL$ and $UCL$ the following quantities:
\begin{eqnarray*}
(LCL,UCL)_{bias}	&=&	\big (\overline X - u_{\frac{p}{2}}S(1+\frac{u_{\frac{p}{2}}^2+3}{4n}), \overline X + 				   			u_{\frac{p}{2}}S(1+\frac{u_{\frac{p}{2}}^2+3}{4n})\big)\\
					&=&(0.5022777, 0.5043863)
\end{eqnarray*}
This was obtained with the following lines in R.
\begin{verbatim}
> p<-0.002
> LCLbias<- xbar - qnorm(1-p/2)*S*(1+((qnorm(1-p/2))^2+3)/(4*n))
> UCLbias<-xbar + qnorm(1-p/2)*S*(1+((qnorm(1-p/2))^2+3)/(4*n))
\end{verbatim}
\noindent
If our aim is to get $P(P_{LCL}>\frac{p}{2}(1+\varepsilon))\leq \alpha$ and $P(P_{UCL}>\frac{p}{2}(1+\varepsilon))\leq \alpha$, then we should take the following control limits.
\begin{eqnarray*}
(LCL,UCL)_{exc}&=&\big(\overline X - u_{\frac{p}{2}}S \big(1+\frac{u_{\alpha}(\frac{1}{2}+u_{\frac{p}{2}}^{-2})^{\frac{1}{2}}}{\sqrt{n}} -\frac{\varepsilon}{u_{\frac{p}{2}}^2}\big),\ \overline X + u_{\frac{p}{2}}S \big(1+\frac{u_{\alpha}(\frac{1}{2}+u_{\frac{p}{2}}^{-2})^{\frac{1}{2}}}{\sqrt{n}} -\frac{\varepsilon}{u_{\frac{p}{2}}^2}\big)\big)\\
&=&(0.5020478, 0.5046162)
\end{eqnarray*}
\begin{verbatim}
> epsilon<-0.1
> alpha<-0.2
> LCLexc<- xbar - qnorm(1-p/2)*S*(1 + ((qnorm(1-alpha)*(1/2+(qnorm(1-p/2))^2)^(1/2)) /(n^(1/2)))-epsilon/((qnorm(1-p/2))^2))
> UCLexc<- xbar + qnorm(1-p/2)*S*(1 + ((qnorm(1-alpha)*(1/2+(qnorm(1-p/2))^2)^(1/2)) /(n^(1/2)))-epsilon/((qnorm(1-p/2))^2))
\end{verbatim}
The resulting control charts with the control limits for the bias reduction and for the exceedance probability are shown in Figure~\ref{fig:improved}.

\begin{figure*}[h]
\caption{Data driven control charts}\label{fig:improved}
\begin{center}
\subfloat[Bias reduction]{\label{fig:bias}%
	\includegraphics[width=7cm]{opg1-xbarbias.jpg}}\qquad
\subfloat[Exceedance probability]{\label{fig:exc}%
	\includegraphics[width=7cm]{opg1-xbarexc.jpg}}\\
\end{center}
\end{figure*}

\begin{verbatim}
vanesdata 	<-	qcc.groups(vanes$V1,vanes$V2)
vanesqccbias<-	qcc(vanesdata, type="xbar", st.dev=S, center=xbar, limits=c(LCLbias,UCLbias))
vanesqccexc	<-	qcc(vanesdata, type="xbar", st.dev=S, center=xbar, limits=c(LCLexc,UCLexc))
\end{verbatim}
It is clear from both charts that the FAR is much reduced when the corrections are used.\\


\subsection*{(g)}
\textit{(Here we do not use the data file.) Investigate which building block (the normal,
the parametric or the minimum chart with m = 3) is chosen by the improved
data driven control chart. Consider only the upper control limit. Perform a
simulation study with $n = 100$. \\
\noindent Take $10000$ simulations (each time $100$ observations) from:
\begin{enumerate}
\item the standard normal distribution 
\item the exponential distribution with mean 1. 
\end{enumerate}
\noindent Report for both situations how many times the normal, the parametric or the
minimum chart is chosen and give comments on the results.}\\[.5cm]
\noindent
The method which is used here is described in \textit{Albers, W. and Kallenberg, W.C.M. (2006). Improved 
data driven control charts. TW-Report No. 1791}. 
\noindent
The idea behind the selection rule described in the article is to stay as long as possible with the classical normal chart, to move to the parametric chart if the tails are too heavy or too light, and to take the non-parametric MIN chart when the parametric family presumably fails as well.\\ 
The data tells us which chart to use. Considering only the upper limit, the selection works as follows.
If
\[
d_{1N}\leq \frac{X_{(n)}-\overline X}{S}\leq d_{2N}
\]
then the normal chart is chosen. If instead
\[
d_{1P}(\hat\gamma_u)\leq\frac{X_{(n)}-\overline X}{S}\leq d_{2P}(\hat\gamma_u)
\]
then the parametric chart is chosen. If neither inequality holds, we use the non-parametric MIN chart.\\
The parameters are calculated in the following way:
\begin{eqnarray*}
d_{1N}&=&u_{\frac{-0.7+0.5\log n}{n}}\\
d_{2N}&=&u_{\frac{5}{n\sqrt n}}\\
d_{1P}(\hat\gamma)&=&c(\hat\gamma)u_{\frac{-0.2+0.5\log n}{n}}^{1+\hat\gamma}\\
d_{2P}(\hat\gamma)&=&c(\hat\gamma)u_{\frac{3}{n\sqrt n}}^{1+\hat \gamma}\\
u_p&=&\overline\Phi^{\,-1}(p)=\Phi^{-1}(1-p)\\
c(\gamma)&=&\pi^{\frac{1}{4}}2^{-\frac{1+\gamma}{2}}\Gamma(\gamma+\frac{3}{2})^{-\frac{1}{2}}\\
\hat\gamma_u&=&1.1218\log\big(\frac{X_{ent(0.95n+1)}-\overline X}{X_{ent(0.75n+1)}-\overline X}\big)-1\\
n&=&100\\
\end{eqnarray*}

\noindent $10{,}000$ simulations from the standard normal distribution:
\begin{verbatim}
>   setwd("C:/Users/Jef/Desktop/Applied Statistics")
> library(qcc)
# for standard normal variables:
n		<-	100
d1N		<-	qnorm(1-(-0.7+0.5*log(n))/n)
d2N		<-	qnorm(1-(5/(n*n^(1/2))))
Norm <- 0
Param <- 0
Minimal <- 0

for(i in 1:10000){
  # draw a sample of 100 standard normally distributed rv's
  NormSample <- rnorm(n)
  # sort in increasing order: X_(1)<...<X_(n)
  SortNormSample <- sort(NormSample)
  # compute the mean and S (standard deviation of these 100 observations)
  Mean <- mean(SortNormSample)
  StDev <- sd(SortNormSample)
  # if d1N<=(X_(100)-mean)/S<=d2N the normal chart is chosen
  if(d1N<=(SortNormSample[100]-Mean)/StDev && (SortNormSample[100]-Mean)/StDev<= d2N){
    Norm <- Norm + 1
  }
  else{
    Gamma <- 1.1218 * log((SortNormSample[96]-Mean)/(SortNormSample[76]-Mean))-1
    GammaFunctie <- pi^(1/4)*2^(-(1+Gamma)/2)*gamma((Gamma+3/2)^(-1/2))
    d1P <- GammaFunctie*(qnorm(1-(-0.2+0.5*log(n))/n))^(1+Gamma)
    d2P <- GammaFunctie*(qnorm(1-3/(n*n^(1/2))))^(1+Gamma)
    if(d1P<=(SortNormSample[100]-Mean)/StDev&& (SortNormSample[100]-Mean)/StDev <= d2P){
      Param <- Param + 1
    }
    else {
      Minimal <- Minimal + 1
    }
  }  
}
> Norm
[1] 4555
> Param
[1] 2960
> Minimal
[1] 2485
\end{verbatim}
When we repeat this a few times, we find the following counts of how many times the normal, the parametric and the minimum chart are chosen: \\

\begin{tabular}{|c|c|c|c|}
\hline
simulation number		& Normal chart 		& Parametric chart		& Minimum Chart \\
\hline
1										& 4555						& 2960								& 2485					\\
2										& 4674						& 2924								& 2402					\\
3										& 4659						& 2901								& 2440					\\
\hline
\end{tabular}
\\
\noindent
From this table we can conclude that the way in which the selection of the chart takes place may not be optimal. All these data sets come from a normal distribution, so the normal chart should have been chosen every time; although almost half of the simulations select the normal chart, too many samples are classified as parametric or even assigned the minimum chart.\\
\\
\noindent
With observations from the exponential distribution with mean $1$ we get:
\begin{verbatim}
# for exponentially distributed variables with mean 1:
n		<-	100
d1N		<-	qnorm(1-(-0.7+0.5*log(n))/n)
d2N		<-	qnorm(1-(5/(n*n^(1/2))))
Norm <- 0
Param <- 0
Minimal <- 0

for(i in 1:10000){
  # draw a sample of 100 exponentially distributed rv's with mean 1
  NormSample <- rexp(n)
  # sort in increasing order: X_(1)<...<X_(n)
  SortNormSample <- sort(NormSample)
  # compute the mean and S (standard deviation of these 100 observations)
  Mean <- mean(SortNormSample)
  StDev <- sd(SortNormSample)
  # if d1N<=(X_(100)-mean)/S<=d2N the normal chart is chosen
  if(d1N<=(SortNormSample[100]-Mean)/StDev && (SortNormSample[100]-Mean)/StDev<= d2N){
    Norm <- Norm + 1
  }
  else{
    Gamma <- 1.1218 * log((SortNormSample[96]-Mean)/(SortNormSample[76]-Mean))-1
    GammaFunctie <- pi^(1/4)*2^(-(1+Gamma)/2)*gamma((Gamma+3/2)^(-1/2))
    d1P <- GammaFunctie*(qnorm(1-(-0.2+0.5*log(n))/n))^(1+Gamma)
    d2P <- GammaFunctie*qnorm(1-3/(n*n^(1/2)))^(1+Gamma)
    if(d1P<=(SortNormSample[100]-Mean)/StDev&&(SortNormSample[100]-Mean)/StDev <= d2P){
      Param <- Param + 1
    }
    else {
      Minimal <- Minimal + 1
    }
  }  
}
> Norm
[1] 14
> Param
[1] 5730
> Minimal
[1] 4256
\end{verbatim}

\begin{tabular}{|c|c|c|c|}
\hline
simulation number		& Normal chart 		& Parametric chart		& Minimum Chart \\
\hline
1										& 14							& 5730								& 4256					\\
2										& 23							& 5660								& 4317					\\
3										& 21 							& 5699								& 4280					\\
\hline
\end{tabular}
\\

For the exponential distribution this selection rule works slightly better than for the normal distribution. Now at least more than half of the samples are classified as parametric, but still a lot end up with the non-parametric minimum chart.
From this we conclude that this selection rule is not very precise.


\end{document}
